##########################################################
## This code to combine multiple data sources.          ##
##########################################################
rm(list=ls())
options(scipen=100,digits=10)
library(rgdal)
library(gdalUtils)
library(raster)
library(plyr)
library(dplyr)
library(maps)
library(mapdata)
library(sf)
library(tidyverse)
library(rasterVis)  # raster visualisation
library(maptools)
library(lubridate)
library(rgeos)
library(sm)
library(foreign)
setwd("processed data")
data_plant=read.csv("ECHO_PP_RID.csv") # EPA inspection data from ECHO database
data_AOD=read.csv("AOD_all.csv") # Satellite measured PM data from NASA
data_AOD=data_AOD[-1]
data_AOD$AOD_DATE=as.Date(data_AOD$AOD_DATE)
names(data_AOD)[33]="DATE"
data_weather=read.csv("all_weather.csv") # Weather (precipitation, temperature, dew point) data from PRISM
data_weather$DATE=as.Date(data_weather$DATE)
data_weather=data_weather[-1]
data_wind=read.csv("all_wind.csv") # Weather (wind) data from NOAA
data_wind=data_wind[-1]
data_wind$DATE=as.Date(data_wind$DATE)
#############################################
## Combine all data                        ##
#############################################
data_wind <- rbind(
  subset(data_wind, DATE<="2014-03-25" & DATE>="2013-10-22"),
  subset(data_wind, DATE<="2015-03-25" & DATE>="2014-10-22"),
  subset(data_wind, DATE<="2016-03-25" & DATE>="2015-10-22"),
  subset(data_wind, DATE<="2017-03-25" & DATE>="2016-10-22"),
  subset(data_wind, DATE<="2018-03-25" & DATE>="2017-10-22"),
  subset(data_wind, DATE<="2019-03-25" & DATE>="2018-10-22")
)

## Merge AOD and weather variables
data1=full_join(data_AOD, data_weather, by=c("Plant_Code", "DATE"))
data1=full_join(data1, data_wind, by=c("Plant_Code", "DATE"))

data1=data1[c("Plant_Code","DATE","AOD","ppt","tdmean","tmean","Uwind","Vwind")]

# Merge plant IDs
setwd("data/raw data")
plants_info=read.csv("powerplant_with_registryID.csv") ## registryID is obtained from EPA
plants_info=plants_info[-1]
plants_info=plants_info[-1]
data2=left_join(data1, plants_info,by=c("Plant_Code"))

data=data2

## Calculate wind speed
data$Wind=sqrt(data$Uwind^2+data$Vwind^2)



## Select the inspection variable
inspect=data_plant[c("Plant_Code", "CAA_FORMAL_ACTION_COUNT","CAA_QTRS_WITH_NC","FAC_INSPECTION_COUNT",
                     "FAC_FORMAL_ACTION_COUNT","FAC_PENALTY_COUNT","CAA_PENALTIES")]

  

## Merge inspection variable to dataset
data=left_join(data, inspect, by=c("Plant_Code"), all.x=T)
data$CAA_FORMAL_ACTION_COUNT[which(is.na(data$CAA_FORMAL_ACTION_COUNT))]=0
data$CAA_QTRS_WITH_NC[which(is.na(data$CAA_QTRS_WITH_NC))]=0
data$FAC_INSPECTION_COUNT[which(is.na(data$FAC_INSPECTION_COUNT))]=0
data$FAC_FORMAL_ACTION_COUNT[which(is.na(data$FAC_FORMAL_ACTION_COUNT))]=0
data$FAC_PENALTY_COUNT[which(is.na(data$FAC_PENALTY_COUNT))]=0
data$CAA_PENALTIES[which(is.na(data$CAA_PENALTIES))]=0


## Set time
TIME=as.data.frame(seq(as.Date("2018-10-22"), as.Date("2019-03-25"), 1))
names(TIME)[1]="DATE"
TIME$T=seq(1,155,1)
TIME$Time=paste(month(TIME$DATE),day(TIME$DATE), sep = "-" )
TIME=TIME[-1]
data$Time=paste(month(data$DATE),day(data$DATE), sep = "-" )
data=left_join(data,TIME, by="Time")
## DATA WITH ECHO-POWER PLANT-AOD-WEAHTER (CONTINENTIAL)
data=subset(data, StateName!="Puerto Rico" & StateName!="Hawaii" & StateName!="Alaska")

## Production data
AMPD=read.csv("data/raw data/epaampd_q1q4.csv") # emissions data from https://ampd.epa.gov/ampd/. 
names(AMPD)[3]="Plant_Code"
names(AMPD)[5]="DATE"
AMPD$DATE=as.character(AMPD$DATE)
AMPD$DATE=as.Date(paste(substr(AMPD$DATE,7,10), substr(AMPD$DATE,1,2) , substr(AMPD$DATE,4,5), sep = "-"))
AMPD=AMPD[c("Plant_Code","DATE","GLOAD..MWh.","SLOAD..1000.lbs.","SO2_MASS..tons.","NOX_MASS..tons.","CO2_MASS..tons.","HEAT_INPUT..mmBtu.")]
AMPD$GLOAD..MWh.[which(is.na(AMPD$GLOAD..MWh.))]=0
AMPD$SLOAD..1000.lbs.[which(is.na(AMPD$SLOAD..1000.lbs.))]=0
AMPD$SO2_MASS..tons.[which(is.na(AMPD$SO2_MASS..tons.))]=0
AMPD$NOX_MASS..tons.[which(is.na(AMPD$NOX_MASS..tons.))]=0
AMPD$CO2_MASS..tons.[which(is.na(AMPD$CO2_MASS..tons.))]=0
AMPD$HEAT_INPUT..mmBtu.[which(is.na(AMPD$HEAT_INPUT..mmBtu.))]=0
AMPD=aggregate(AMPD[c("GLOAD..MWh.","SLOAD..1000.lbs.","SO2_MASS..tons.","NOX_MASS..tons.","CO2_MASS..tons.","HEAT_INPUT..mmBtu.")], by=list(AMPD$Plant_Code,AMPD$DATE), FUN=sum)
names(AMPD)[1:2]=c("Plant_Code","DATE")
data=left_join(data, AMPD, by=c("Plant_Code","DATE"))
data$SO2_Rate_KG=data$SO2_MASS..tons./data$HEAT_INPUT..mmBtu.*1000
data$NOX_Rate_KG=data$NOX_MASS..tons./data$HEAT_INPUT..mmBtu.*1000
data$CO2_Rate_KG=data$CO2_MASS..tons./data$HEAT_INPUT..mmBtu.*1000

setwd("data/processed data")
save(data, file = "Data_Combined.RData")
